/* Additional cleaning for individual level data. */



#delimit ;
* Session setup. Every statement from here on is terminated by a semicolon;
set more off;
clear all;

* Base name of the do-file that is recorded in the notes of the saved data sets;
local outfile "GWgap_clean_ind_v3";

* Blank line followed by the run date and time, as a stamp in the log;
display _n "$S_DATE $S_TIME";








*1*******************************************************************************;
* making HLFS data set at the annual level to merge for individual wage regs;
********************************************************************************;

* Load only the quarterly HLFS records with positive, non-missing hours in the main job;
use if hrs_main>0 & !missing(hrs_main) using Wgap_HLFS_IDI_quart_v3_recent, clear;

* Household composition code 9 is blanked so it cannot win the max in the collapse below.
  It is put back for person-years with no other value once the collapse is done;
replace hhcomp = . if hhcomp==9;

* When main-job hours equal total hours, hours in other jobs are set to zero;
replace hrs_oth = 0 if !missing(hrs_main) & hrs_tot==hrs_main;

* Order records by person, year and quarter so that the first statistic below
  takes the earliest quarter within each person and financial year;
sort snz_uid year quarter;

* collapsing to one obs per employed person financial year.
  The order of the statistics is kept as is because later code refers to the
  variable range from female to num_quart;
collapse 
	(min)   female 
	(max)   hqual_to12 hqual_fr13 par_u18_ct multi_job ft nz_born hhcomp
	(mean)  age hrs_main hrs_oth hrs_tot 
	(first) occ2d occ3d ind2 rc 
	(sum)   HLFS_wgt
	(count) num_quart = quarter,
	by(snz_uid fin_yr);
	

* labelling;

* Attach a source note to every variable in the range female to num_quart;
unab hlfs_vars : female - num_quart;
foreach v of local hlfs_vars {;
	notes `v': From HLFS.;
};

* Person-years left with no household composition value are given code 9;
replace hhcomp = 9 if hhcomp==.;

* Value labels. The hhcomp label set is not defined here, so it is assumed to
  come with the input data - TODO confirm;
label define hqual 
	0 "No qualifications" 
	1 "School qualifications" 
	2 "Post-school qualifications" 
	3 "Degree";
label values hqual_to12 hqual_fr13 hqual;
label values hhcomp hhcomp;

* Variable labels, in the order of the collapse statistics;
label variable female "Female";
label variable hqual_to12 "Highest qualification (years <=2013)";
label variable hqual_fr13 "Highest qualification (years >=2014)";
label variable par_u18_ct "Parent of number of dependent children (max in the year)";
label variable multi_job "Ever reported multiple jobs in HLFS in the year";
label variable ft "Ever worked full time in the year";
label variable nz_born "New Zealand born";
label variable hhcomp "Household composition";
label variable age "Average age in the year";
label variable hrs_main "Average hours in main job";
label variable hrs_oth "Average hours in non-main jobs";
label variable hrs_tot "Average total hours in all jobs";
label variable occ2d "Occupation (2-digit ANZSCO 2006, first in year)";
label variable occ3d "Occupation (3-digit ANZSCO 2006, first in year)";
label variable ind2 "Industry (ANZSIC 2006 level 2, first in year)";
label variable rc "Regional Council (first in year)";
label variable HLFS_wgt "Sum of HLFS weights in included quarters";
label variable num_quart "Number of quarters included from HLFS data";

* The financial year becomes the year key used by the later merge;
rename fin_yr year;
label variable year "Financial year";

* Average ages outside 16 to 80, and missing ages, end up as system missing;
replace age = . if !inrange(age, 16, 80);
notes age: Set to missing if under 16 or over 80;


compress;
notes: Wgap_HLFS_IDI_finyr_v3.dta was created by `outfile'.do from Wgap_HLFS_IDI_quart_v3_recent.dta.
It contains an observation per person-financial year for those observed in the year with
positive hours only. Values are based on quarters observed working.;
save Wgap_HLFS_IDI_finyr_v3, replace;





* Build the HLFS file used in the merge. Every variable from female to num_quart
  gets the suffix _HLFS, which keeps names such as age and ind2 distinct from the
  IDI variables of the same name that exist at merge time;
use Wgap_HLFS_IDI_finyr_v3, clear;

unab hlfs_vars : female - num_quart;
foreach v of local hlfs_vars {;
	rename `v' `v'_HLFS;
};

* Keep financial years up to 2016. Rows with a missing year are dropped as well;
drop if year>2016;
save temp_HLFS, replace;







*2*******************************************************************************;
* generating variables from IDI/LBD for use in individual regressions and merging with HLFS annual data;
********************************************************************************;

* One pass per input file Wgap_v3_ind1 to Wgap_v3_ind5. Each pass saves two files.
  The first is a reduced-variable copy of all rows for that part (temp_IDIv plus the part number).
  The second holds only the rows that match a person-year in temp_HLFS (temp plus the part number).
  New variables are created in a fixed order so the saved variable order is unchanged;
forvalues i = 1/5 {;

	* Load this part, restricted to years up to 2016;
	use if year<=2016 using Wgap_v3_ind`i', clear;
	
	compress;
	
	* basic variables;
	
	rename min_age age;
	label variable age "Age";
	notes age: Age at start of March year;

	* Values above 1 are blanked, then every existing note on the variable is removed;
	replace tot_fte_employee_av = . if tot_fte_employee_av>1;
	notes drop tot_fte_employee_av;
	
	* converting all earning variables to real values.
	  Rows whose year is not in the CPI file are kept and get a missing deflator;
	
	merge m:1 year using CPI, keep(master match) nogenerate;
	foreach earn in tot_gross_earn_yr max_gross_earn_yr {;
		replace `earn' = deflator*`earn';
		notes `earn': Real 2006 $;
	};
	
	* Log real earnings per month worked, over all jobs and at the hp pent;
	generate lnWpm = log(tot_gross_earn_yr/tot_mon_wkd);
	label variable lnWpm "Average wage/salary earnings per month worked (ln real)";
	
	generate lnWpm_hp = log(max_gross_earn_yr/max_mon_wkd);
	label variable lnWpm_hp "Average wage/salary earnings per month worked at hp pent (ln real)";
	
	* Annual average FTE rescaled to the months actually worked. Left missing when the FTE input exceeds 1 or is missing;
	generate indiv_avfte = 12*tot_fte_employee_av/tot_mon_wkd if tot_fte_employee_av<=1;
	label variable indiv_avfte "Average FTEs in each month worked";
	
	generate indiv_avfte_hp = 12*max_fte_employee_av/max_mon_wkd if max_fte_employee_av<=1;
	label variable indiv_avfte_hp "Average FTEs at hp pent in each month worked at hp pent";

	* NOTE(review): any nonmissing num_pents other than 1 to 5 goes to category 4, which
	  would include 0 or fractional values if they occur - confirm num_pents is a positive integer;
	recode num_pents (1 = 1) (2 = 2) (3 4 5 = 3) (nonmiss = 4), gen(num_pents_cat);
	label define num_pents_cat 1 "1" 2 "2" 3 "3-5" 4 ">5";
	label values num_pents_cat num_pents_cat;
	label variable num_pents_cat "Number of pents worked at during year (categories)";
	
	* Full-year indicators. Missing when the month count is missing;
	generate full_yr = (tot_mon_wkd==12) if !missing(tot_mon_wkd);
	label variable full_yr "Worked 12 months of year";
	
	generate full_yr_hp = (max_mon_wkd==12) if !missing(max_mon_wkd);
	label variable full_yr_hp "Worked 12 months of year at hp pent";
	
	* Full-time share, defined only when full-time plus part-time months add up to the months worked at the hp pent;
	generate ft_hp = max_mon_wkd_ft/(max_mon_wkd_ft + max_mon_wkd_pt) 
		if max_mon_wkd==max_mon_wkd_ft + max_mon_wkd_pt;
	label variable ft_hp "Fraction of employee's months at hp pent worked full time";
	
	
	* firm size variables;
	
	generate hc10 = (L_hc>=10) if !missing(L_hc);
	label variable hc10 "Pent average head count is 10+";
	
	generate fte10 = (L_fte>=10) if !missing(L_fte);
	label variable fte10 "Pent average FTE is 10+";
	
	generate lnL_hc = log(L_hc);
	label variable lnL_hc "Pent average head count (ln)";
	
	generate lnL_fte = log(L_fte);
	label variable lnL_fte "Pent average FTE (ln)";

	* NOTE(review): a nonmissing L_WP that is not 0, not 1 and not inside 2 to 5 or 6 to 10
	  (for example a fractional value below 2 or between 5 and 6) falls in category 4,
	  labelled >10 - confirm L_WP is integer valued;
	recode L_WP (0 = 0) (1 = 1) (2/5 = 2) (6/10 = 3) (nonmiss = 4), gen(WP_cat);
	label define WP_cat 0 "0" 1 "1" 2 "2-5" 3 "6-10" 4 ">10";
	label values WP_cat WP_cat;
	label variable WP_cat "Working proprietor headcount (categories)";
	

	
	
	* making industry variables at different levels;

	egen ind4 = group(anz06_4d);
	label variable ind4 "4-digit ANZSIC industry";
	
	* The code for level j is the first j+1 characters of the 4-digit code string;
	forvalues j = 3(-1)1 {;
		generate anz06_`j'd = substr(anz06_4d,1,`j'+1);
	};

	forvalues j = 1/3 {;
		egen ind`j' = group(anz06_`j'd);
		label variable ind`j' "`j'-digit ANZSIC industry";
	};
	drop anz06_1d anz06_2d anz06_3d;
	

	label variable wkd_hpp_2ya "Worked for highest paying pent for last 2 yrs";
	
	* saving a version with fewer variables for the IDI sample.
	  The full data are set aside and brought back once the reduced copy is on disk;
	
	preserve;
	drop max_mon_wkd_ft max_mon_wkd_pt 
		L_hc_ft L_hc_pt 
		av_fte_m av_fte_f s_f_fte 
		deflator ft_hp hc10 fte10 
		snz_ird_uid WP L_fte L_WP rand WP_cat 
		ind1 ind2 ind3;
	compress;
	save temp_IDIv`i', replace;
	restore;
	
	
	* merging on HLFS data;
	
	display "Variables in annual IDI individul level data";
	describe;
	
	display "Variables in HLFS data to be merged on";
	describe using temp_HLFS;
	
	* All three merge outcomes are kept at first and the file is cut to matches in the next step;
	merge 1:1 snz_uid year using temp_HLFS, keep(master match using);

	keep if _merge==3;
	display "i = `i', observations = ";
	count;
	
	compress;
	
	save temp`i', replace;
	clear;

};



**** appending all IDI data matched to HLFS into one data set;

clear; 
forvalues i = 1/5 {;

	* Stack the matched file for this part and remove exact duplicate rows.
	  isid then stops the run with an error unless snz_uid and year identify rows uniquely;
	append using temp`i';
	duplicates drop;
	isid snz_uid year;
	
};

keep if year<=2016;

* Every row kept upstream is a match, so the merge indicator carries no information.
  The full variable name is used so this does not rely on variable abbreviation;
drop _merge;






* making age category dummies that match ranges used in firm level regs.
  Bands are below 25, 25 to under 40, 40 to under 55, and 55 or over.
  A missing age leaves age_cat missing;

gen age_cat = .;
replace age_cat = 1 if age<25;
replace age_cat = 2 if age>=25 & age<40;
replace age_cat = 3 if age>=40 & age<55;
replace age_cat = 4 if age>=55 & age<.;

* Category 3 stops short of 55, so its label ends at 54, in line with the 25 to 39 label;
label define age_cat 1 "<25" 2 "25 to 39" 3 "40 to 54" 4 "55+";
label val age_cat age_cat;
label var age_cat "Age category";
notes age_cat: Based on age at start of financial year;
	
notes: GWgap_HLFS_indivregdata.dta was created by `outfile'.do on $S_DATE at $S_TIME.
It is at the person-financial year level, for workers who match between the HLFS and IDI.;
save GWgap_HLFS_indivregdata, replace;







*3*******************************************************************************;
* making a file with fewer variables but including all IDI observations to use in individual IDI regs;
********************************************************************************;

clear; 
forvalues i = 1/5 {;

	* Stack the reduced-variable file for this part and remove exact duplicate rows.
	  isid then stops the run with an error unless snz_uid and year identify rows uniquely;
	append using temp_IDIv`i';
	duplicates drop;
	isid snz_uid year;
	
};

keep if year<=2016;


* making age category dummies that match ranges used in firm level regs.
  Bands are below 25, 25 to under 40, 40 to under 55, and 55 or over.
  A missing age leaves age_cat missing;

gen age_cat = .;
replace age_cat = 1 if age<25;
replace age_cat = 2 if age>=25 & age<40;
replace age_cat = 3 if age>=40 & age<55;
replace age_cat = 4 if age>=55 & age<.;

* Category 3 stops short of 55, so its label ends at 54, in line with the 25 to 39 label;
label define age_cat 1 "<25" 2 "25 to 39" 3 "40 to 54" 4 "55+";
label val age_cat age_cat;
label var age_cat "Age category";
notes age_cat: Based on age at start of financial year;

compress;

* The semicolon delimiter set at the top of the file is still in force here;
notes: GWgap_IDI_indivregdata.dta was created by `outfile'.do on $S_DATE at $S_TIME.
It is at the person-financial year level, for all IDI workers.;
save GWgap_IDI_indivregdata, replace;






